#this is just to remove the warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
#installing some libraries
!pip install pandas_profiling
!pip install xgboost
#import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pandas_profiling
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import Imputer,OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
def import_data(data_dir="Challenge Data"):
    """Load the Kaggle house-price train/test CSVs.

    Parameters
    ----------
    data_dir : str, optional
        Folder containing ``train.csv`` and ``test.csv``.  The default
        keeps the original hard-coded location, so existing calls with
        no arguments behave exactly as before.

    Returns
    -------
    (DataFrame, DataFrame)
        The train and test frames, in that order.
    """
    train = pd.read_csv("%s/train.csv" % data_dir)
    test = pd.read_csv("%s/test.csv" % data_dir)
    return train, test
# First pass: naive cleaning -- mean-impute numerics, "None"-fill categoricals,
# one-hot encode, and split back into train/test matrices.
train,test=import_data()
train.head(5)
test.head(5)
# Automated EDA report of the raw training set.
pandas_profiling.ProfileReport(train)
#get the IDs (kept aside so predictions can be re-attached later)
train_ID=train.Id
test_ID=test.Id
#get sales price
y=train.SalePrice
#drop the non-feature columns
train=train.drop(['Id','SalePrice'],axis=1)
test=test.drop('Id',axis=1)
# Number of training rows -- used to slice the combined frame back apart.
index=train.shape[0]
#combine both dataframes so imputation/encoding is applied consistently
combined=train.append(test)
#this is the number of elements for each feature
combined.describe().iloc[0,:]
combined.LotFrontage.describe()
# Mean-impute the numeric columns with missing values.
combined.LotFrontage.fillna(combined.LotFrontage.mean(),inplace=True)
combined.LotFrontage.describe()
combined.MasVnrArea.describe()
combined.MasVnrArea.fillna(combined.MasVnrArea.mean(),inplace=True)
combined.GarageYrBlt.describe()
combined.GarageYrBlt.fillna(combined.GarageYrBlt.mean(),inplace=True)
combined.describe().iloc[0,:]
# Temporarily re-attach the target to rank features by correlation with price.
# NOTE(review): after append(), `combined` has duplicate row labels, and this
# assignment aligns SalePrice by label rather than position -- confirm intent.
combined['SalePrice']=y
combined.corr().SalePrice.sort_values(ascending=False)
# Drop the weakly-correlated features along with the target itself.
combined.drop(['PoolArea','MoSold','BsmtFinSF2','3SsnPorch','BsmtHalfBath','LowQualFinSF','YrSold','MiscVal','OverallCond','MSSubClass','SalePrice'],axis=1,inplace=True)
combined.Alley.isna().sum()
combined.isna().sum().sort_values(ascending=False)
# Remaining (categorical) gaps become the literal string "None".
combined.fillna("None",inplace=True)
#hot encode the data
combined=pd.get_dummies(combined)
combined.isnull().sum().sort_values(ascending=False)
# Split back into train/test feature matrices.
mtrain=combined.iloc[:index,:]
mtest=combined.iloc[index:,:]
#this function takes an input as a model and prints the average rmse of k-fold cross validation
def model_score(model, mtrain, y, cv=50):
    """Print the average RMSE of `model` over `cv`-fold cross validation.

    Parameters
    ----------
    model : estimator following the sklearn fit/predict API
    mtrain : feature matrix
    y : target vector
    cv : int, optional
        Number of cross-validation folds (default 50).
    """
    #mse are by default negative in sklearn, so flip the sign
    # BUG FIX: honour the `cv` argument instead of the hard-coded 50.
    mse = -cross_val_score(model, mtrain, y, cv=cv, scoring='neg_mean_squared_error')
    #take square root
    rmse = np.sqrt(mse)
    #average
    average_rmse = np.mean(rmse)
    # BUG FIX: report the average over all folds, not just the first fold
    # (the original printed rmse[0]).
    print("The average RMSE is: %f." % average_rmse)
# Baseline comparison of four regressors, then the same models on
# PCA-reduced features.
#Linear Regressor
regressor=LinearRegression()
regressor.fit(mtrain,y)
model_score(regressor,mtrain,y)
#random forest
rf=RandomForestRegressor(n_estimators=100)
rf.fit(mtrain,y)
model_score(rf,mtrain,y)
#XGBoost
xgb=XGBRegressor()
xgb.fit(mtrain,y)
model_score(xgb,mtrain,y)
#SVR
svr=SVR(kernel='rbf')
svr.fit(mtrain,y)
model_score(svr,mtrain,y)
#we will try the same models but we will apply PCA to the data
pca=PCA(n_components=100)
pca_mtrain=pca.fit_transform(mtrain)
regressor.fit(pca_mtrain,y)
model_score(regressor,pca_mtrain,y)
rf.fit(pca_mtrain,y)
model_score(rf,pca_mtrain,y)
xgb.fit(pca_mtrain,y)
model_score(xgb,pca_mtrain,y)
svr=SVR(kernel='rbf')
svr.fit(pca_mtrain,y)
# BUG FIX: score the SVR on the PCA-transformed features it was fitted on;
# the original passed the raw `mtrain`, giving an inconsistent/misleading score.
model_score(svr,pca_mtrain,y)
def import_data():
    """Re-load the raw train/test CSVs from the Challenge Data folder."""
    paths = ("Challenge Data/train.csv", "Challenge Data/test.csv")
    # Read both files lazily, then unpack in (train, test) order.
    train, test = (pd.read_csv(p) for p in paths)
    return train, test
# Second pass: reload the raw data and explore missingness and the target.
train,test=import_data()
#get IDs
train_ID=train.Id
test_ID=test.Id
#get SalePrices
y=train.SalePrice
train.shape
#drop the unwanted columns
train.drop(['Id','SalePrice'],axis=1,inplace=True)
test.drop('Id',axis=1,inplace=True)
train.shape
#total number of rows of dataset
n_rows=train.shape[0]+test.shape[0]
#proportions of number of rows that have missing data to total number of rows
test_miss=test.isna().sum()/n_rows
train_miss=train.isna().sum()/n_rows
# Horizontal bar charts of the missing-value percentages (train, then test).
plt.figure(figsize=(13,7))
plt.title("Percentage of missing values in training dataset")
train_miss[train_miss>0].sort_values(ascending=True).plot(kind="barh")
plt.xlabel("Percentage")
plt.show()
plt.figure(figsize=(13,7))
plt.title("Percentage of missing values in test dataset")
test_miss[test_miss>0].sort_values(ascending=True).plot(kind="barh")
plt.xlabel("Percentage")
plt.show()
# SalePrice is skewed; the log transform below reduces kurtosis/skewness,
# so log_y is used as the modelling target later on.
plt.figure(figsize=(13,7))
sns.distplot(y)
plt.title("Distribution Plot of SalePrice")
print("Kurtosis: %f" %y.kurt())
print("Skewness: %f" %y.skew())
plt.figure(figsize=(13,7))
log_y=np.log(y)
sns.distplot(log_y)
plt.show()
print("Kurtosis: %f" %log_y.kurt())
print("Skewness: %f" %log_y.skew())
def get_features(train):
    """Return (numerical_column_names, categorical_column_names) for `train`.

    Columns whose dtype is a numpy number are numerical; everything else
    is treated as categorical.  Both results are pandas Index objects in
    the frame's original column order.
    """
    numeric = train.select_dtypes(include=np.number)
    non_numeric = train.select_dtypes(exclude=np.number)
    return numeric.columns, non_numeric.columns
numerical_features,categorical_features=get_features(train)
# Histogram of every numerical feature.
train.hist(figsize=(25,20))
plt.show()
#dictionary containing feature names as keys and correlation with sale price as values
corr_dict=train[numerical_features].corrwith(y).sort_values(ascending=False).to_dict()
#graphs are sorted by alphabetical order of the feature names
i=1
plt.figure(figsize=(55, 60))
for k in sorted(corr_dict):
    plt.subplot(6,6,i)
    plt.scatter(train[k],y)
    plt.title("Correlation: %f" %corr_dict[k],fontsize=30)
    plt.xlabel(k,fontsize=25)
    plt.ylabel("Sale Prices",fontsize=25)
    i+=1
#trying colors for heat map
sns.palplot(sns.color_palette("RdBu_r", 6))
plt.subplots(figsize=(15,10))
ax=sns.heatmap(train.corr(),cmap=sns.color_palette("RdBu_r", 5),linewidths=.5,vmin=-1,vmax=1)
cbar = ax.collections[0].colorbar
cbar.set_ticks(np.arange(-1,1.1,0.2))
plt.title("Heatmap of Numerical Features")
plt.show()
# Strip plots of SalePrice against each categorical feature.
i=1
plt.figure(figsize=(80, 350))
for cat in sorted(categorical_features):
    plt.subplot(22,2,i)
    #this is just for the xlabels to not overlap
    if(cat=="Neighborhood"):
        plt.xticks(rotation=30)
    #stripplot generalizes scatter to categorical variables
    sns.stripplot(train[cat],y)
    plt.xlabel(cat,fontsize=30)
    plt.ylabel("Sale Prices",fontsize=30)
    plt.xticks(fontsize=27)
    plt.yticks(fontsize=27)
    i+=1
# Per-dataset missing-value counts.
miss_train=train.isna().sum().sort_values(ascending=False)
miss_train[miss_train>0]
miss_test=test.isna().sum().sort_values(ascending=False)
miss_test[miss_test>0]
#used to slice test from train
index=train.shape[0]
index
# Combine train+test so the cleaning below is applied once, consistently.
full=pd.concat([train,test],ignore_index=True)
miss_full=full[numerical_features].isna().sum().sort_values(ascending=False)
miss_full=miss_full[miss_full>0]
miss_full
miss_full_cols=miss_full.index
pandas_profiling.ProfileReport(full[miss_full_cols])
# Sanity checks on garage years vs. construction years.
full[full.YearBuilt==1880][['YearBuilt','GarageYrBlt']]
full[full.GarageYrBlt<full.YearBuilt][['YearBuilt','GarageYrBlt']]
plt.figure(figsize=(13,9))
full.GarageYrBlt.hist()
plt.title("Distribution Plot of GarageYrBlt")
plt.show()
#fill missing values by 0 (missing year -> no garage)
full.GarageYrBlt.fillna(0,inplace=True)
full.LotFrontage.fillna(full.LotFrontage.mode()[0],inplace=True)
full.MasVnrArea.fillna(0,inplace=True)
full[numerical_features].isnull().sum().sort_values(ascending=False)>0
#number of missing values for categorical features
miss_full_cat=full[categorical_features].isna().sum().sort_values(ascending=False)
miss_full_cat=miss_full_cat[miss_full_cat>0]
miss_full_cat
full["Electrical"].describe()
miss_full_cat['Electrical']
print("Unique Values for the feature Electrical:", full["Electrical"].unique())
print("Number of missing values for the feature Electrical:", miss_full_cat["Electrical"])
print("\n")
plt.figure(figsize=(13,9))
full.Electrical.hist()
plt.title("Distribution plot of Electrical")
plt.show()
full[full.Electrical.isnull()>0]
# Impute the missing Electrical value with the dominant category.
full.Electrical.fillna('SBrkr',inplace=True)
#sum all basement area
bsmtsum=full.BsmtFinSF1+full.BsmtFinSF2+full.BsmtUnfSF
#compare to total sum -- if equal, the components duplicate TotalBsmtSF
bsmtsum.equals(full.TotalBsmtSF)
full.shape
# Drop components that are linearly determined by their totals.
full.drop('BsmtFinSF1',axis=1,inplace=True)
full.drop('BsmtFinSF2',axis=1,inplace=True)
full.drop('BsmtUnfSF',axis=1,inplace=True)
(full['1stFlrSF']+full['2ndFlrSF']+full.LowQualFinSF).equals(full.GrLivArea)
full.drop('LowQualFinSF',axis=1,inplace=True)
full.drop('1stFlrSF',axis=1,inplace=True)
full.drop('2ndFlrSF',axis=1,inplace=True)
(full['3SsnPorch']+full.EnclosedPorch+full.OpenPorchSF+full.ScreenPorch).iloc[:index].corr(y)
full.drop('MiscVal',axis=1,inplace=True)
# Histograms of the categorical features to spot near-constant columns.
i=1
plt.figure(figsize=(50, 250))
for cat in sorted(categorical_features):
    plt.subplot(22,2,i)
    #this is just for the xlabels to not overlap
    if(cat in ["Neighborhood","Exterior1st","Exterior2nd"]):
        plt.xticks(rotation=30)
    full[cat].hist()
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.title("{}".format(cat),size=20)
    #plt.show()
    i+=1
full.drop('Exterior1st',axis=1,inplace=True)
full.drop('Utilities',axis=1,inplace=True)
full.isnull().sum().sort_values(ascending=False)
# Remaining categorical gaps become the literal string "None".
full.fillna("None",inplace=True)
full_wd=full.copy() #wd=with droping first dummy variable column
full_wod=full.copy()#wod=without dropping first dummy variable column
#dummy encode
# NOTE(review): the wd/wod comments above appear swapped relative to the code:
# full_wd is encoded WITHOUT drop_first and full_wod WITH drop_first -- confirm.
full_wd=pd.get_dummies(full_wd)
full_wod=pd.get_dummies(full_wod,drop_first=True)
#split to train and test
train_wd=full_wd.iloc[:index,:]
test_wd=full_wd.iloc[index:,:]
#split to train and test
train_wod=full_wod.iloc[:index,:]
test_wod=full_wod.iloc[index:,:]
# FIX: cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score
def model_score(model, mtrain, y, cv=50):
    """Print the mean and standard deviation of the RMSE of `model`
    over `cv`-fold cross validation on (mtrain, y)."""
    #mse are by default negative in sklearn, so flip the sign
    mse = -cross_val_score(model, mtrain, y, cv=cv, scoring='neg_mean_squared_error')
    rmse = np.sqrt(mse)
    average_rmse = np.mean(rmse)
    std = np.std(rmse)
    #print(rmse)
    print("The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
# Compare models on both encodings (wd / wod) with the log target.
#linear regressor: train_wd+log_y
lm1=LinearRegression()
lm1.fit(train_wd,log_y)
model_score(lm1,train_wd,log_y)
#linear regressor: train_wod+log_y
lm2=LinearRegression()
lm2.fit(train_wod,log_y)
model_score(lm2,train_wod,log_y)
#random forest: train_wod+log_y
rlf=RandomForestRegressor()
rlf.fit(train_wod,log_y)
model_score(rlf,train_wod,log_y)
#random forest: train_wd+log_y
rlf2=RandomForestRegressor()
rlf2.fit(train_wd,log_y)
model_score(rlf2,train_wd,log_y)
#XGBoost: train_wod+log_y
xgb=XGBRegressor()
xgb.fit(train_wod,log_y)
model_score(xgb,train_wod,log_y)
#XGBoost: train_wd+log_y
xgb=XGBRegressor()
xgb.fit(train_wd,log_y)
model_score(xgb,train_wd,log_y)
# Third pass: reload the raw data and impute with more care.
train,test=import_data()
#get IDs
train_ID=train.Id
test_ID=test.Id
#get SalePrices
y=train.SalePrice
#drop the unwanted columns
train.drop(['Id','SalePrice'],axis=1,inplace=True)
test.drop('Id',axis=1,inplace=True)
#log the target
log_y=np.log(y)
numerical_features,categorical_features=get_features(train)
#combine train and test data
full=pd.concat([train,test],ignore_index=True)
num_miss=full[numerical_features].isna().sum().sort_values(ascending=False)
num_miss[num_miss>0]
# Inspect LotFrontage per neighborhood before imputing it.
df=full[['LotFrontage','Neighborhood']]
neighb=df.Neighborhood.unique()
neighb
for n in neighb:
    df[df.Neighborhood==n].plot.bar(figsize=(10,5),xticks=None,color = "skyblue")
    plt.tick_params(axis='x',which='both',bottom='off',top='off',labelbottom='off')
    plt.title("Lot Frontages for %s" %n)
    # red dashed line at the neighborhood's mean lot frontage
    plt.axhline(y=df[df.Neighborhood==n].mean()[0], color='r', linestyle='--',label='Mean')
    plt.ylabel("Lot Frontage")
    plt.legend()
# Impute LotFrontage with the rounded mean of the house's own neighborhood.
full.LotFrontage=full.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(round(x.mean())))
#check for missing values
full.LotFrontage.isna().sum()
garage_features=[n for n in full.columns if 'Garage' in n]
garage_features
full[garage_features].isna().sum()
# Check that missing garage fields coincide with houses that have no garage.
full[full.GarageType.isna()].GarageCars.sum()
full[full.GarageCars==0].shape[0]
full[full.GarageType.isna()].GarageArea.sum()
full[full.GarageArea==0].shape[0]
full.GarageType.isna().equals(full.GarageYrBlt.isna())
full.GarageType.isna().equals(full.GarageCond.isna())
full.GarageType.isna().equals(full.GarageQual.isna())
# GarageYrBlt correlates strongly with YearBuilt, so drop it.
full.GarageYrBlt.corr(y)
full.GarageYrBlt.fillna(0).corr(y)
full.GarageYrBlt.corr(full.YearBuilt)
full.drop('GarageYrBlt',axis=1,inplace=True)
mason_feat=[ind for ind in full.columns if 'Mas' in ind]
full[mason_feat].isna().sum()
full.MasVnrArea.fillna(0,inplace=True)
numerical_features,categorical_features=get_features(full)
full[numerical_features].isna().sum().sort_values(ascending=False)
full[categorical_features].isna().sum().sort_values(ascending=False)
# Houses with no pool have PoolQC missing -- encode them explicitly.
full.PoolArea[full.PoolQC.isna()].sum()
full[full.PoolArea==0].equals(full[full.PoolQC.isna()])
full.PoolQC.fillna('No Pool',inplace=True)
# Plot each sparse categorical before and after filling its missing values.
plt.figure(figsize=(10,7))
full.MiscFeature.hist()
plt.xlabel("MiscFeature")
plt.ylabel("Count")
plt.show()
full.MiscFeature.fillna("None",inplace=True)
plt.figure(figsize=(10,7))
full.MiscFeature.hist()
plt.xlabel("MiscFeature")
plt.ylabel("Count")
plt.show()
plt.figure(figsize=(10,7))
full.Alley.hist()
plt.xlabel("Alley")
plt.ylabel("Count")
plt.show()
full.Alley.fillna("No Alley",inplace=True)
plt.figure(figsize=(10,7))
full.Alley.hist()
plt.xlabel("Alley")
plt.ylabel("Count")
plt.show()
plt.figure(figsize=(10,7))
full.Fence.fillna("No Fence",inplace=True)
full.Fence.hist()
plt.xlabel("Fence")
plt.ylabel("Count")
plt.show()
plt.figure(figsize=(10,7))
full.FireplaceQu.hist()
plt.xlabel("Fireplace Quality")
plt.ylabel("Count")
plt.show()
# FireplaceQu is missing exactly where there is no fireplace.
full[full.FireplaceQu.isna()].equals(full[full.Fireplaces==0])
full.FireplaceQu.fillna("NA",inplace=True)
plt.figure(figsize=(10,7))
full.FireplaceQu.hist()
plt.xlabel("Fireplace Quality")
plt.ylabel("Count")
plt.show()
# Missing garage categoricals mean "no garage".
full.GarageCond.fillna('No Garage',inplace=True)
full.GarageQual.fillna('No Garage',inplace=True)
full.GarageFinish.fillna('No Garage',inplace=True)
full.GarageType.fillna('No Garage',inplace=True)
#just to check
full[full.GarageArea==0].equals(full[full.GarageCond=='No Garage'])
full.TotalBsmtSF.isna().sum()
bsmt_features=[ind for ind in full.columns if 'Bsmt' in ind]
full[bsmt_features].isna().sum().sort_values(ascending=False)
# Compare which rows are missing BsmtFinType1 vs. BsmtFinType2.
full[bsmt_features][full.BsmtFinType1.isna()].shape
full[bsmt_features][full.BsmtFinType2.isna()].shape
# Locate the single row where BsmtFinType2 is missing but BsmtFinType1 is not.
# NOTE: `full` was built with ignore_index=True, so labels equal positions here.
i=full[bsmt_features][full.BsmtFinType2.isna()].index.difference(full[bsmt_features][full.BsmtFinType1.isna()].index)[0]
i
pd.DataFrame(full[bsmt_features].iloc[i,:])
full.BsmtFinType2.hist()
plt.show()
full[['BsmtFinSF2','BsmtFinType2']][full.BsmtFinType2=='Unf']
# Value counts of BsmtFinType2, used to pick a plausible replacement.
[(ind,full.BsmtFinType2[full.BsmtFinType2==ind].count()) for ind in full.BsmtFinType2.unique()]
# BUG FIX: `full.BsmtFinType2.iloc[i]=...` is chained indexing -- it may assign
# to a temporary copy and never update `full`. Assign via .loc on the frame.
full.loc[i,'BsmtFinType2']='Rec'
# Same treatment for the lone row missing only BsmtExposure.
i=full[bsmt_features][full.BsmtExposure.isna()].index.difference(full[bsmt_features][full.BsmtFinType1.isna()].index)[0]
i
pd.DataFrame(full[bsmt_features].iloc[i,:])
full.BsmtExposure.hist()
plt.show()
full.loc[i,'BsmtExposure']='No'
full[bsmt_features].isna().sum()
#check if missing rows are the same for categorical features
full[full.BsmtQual.isna()].equals(full[full.BsmtFinType1.isna()])\
and full[full.BsmtQual.isna()].equals(full[full.BsmtFinType2.isna()])\
and full[full.BsmtQual.isna()].equals(full[full.BsmtCond.isna()]) \
and full[full.BsmtQual.isna()].equals(full[full.BsmtExposure.isna()])
#check for numerical variable
full[full.BsmtQual.isna()].equals(full[full.TotalBsmtSF==0])
# Missing basement categoricals mean "no basement".
full.BsmtQual.fillna('No Basement',inplace=True)
full.BsmtCond.fillna('No Basement',inplace=True)
full.BsmtFinType1.fillna('No Basement',inplace=True)
full.BsmtExposure.fillna('No Basement',inplace=True)
full.BsmtFinType2.fillna('No Basement',inplace=True)
#check if we have any missing values for basement features
full[bsmt_features].isna().sum()
full.MasVnrType.fillna("No Masonry Veneer",inplace=True)
full.Electrical.hist()
plt.xlabel("Electrical")
plt.ylabel("Count")
plt.show()
# Impute the lone missing Electrical value with the dominant category.
full.Electrical.fillna('SBrkr',inplace=True)
#check to see if we have any missing values
full.isna().sum().sort_values(ascending=False)
#we will keep a copy of the cleaned data for future use
full_cleaned=full.copy()
#full=full_cleaned.copy()
# Ordinal-encode PoolQC (larger number = better pool).
full.PoolQC.unique()
dict1={'No Pool':0,'Fa':1,'Gd':2,'Ex':3}
full.PoolQC=full.PoolQC.apply(lambda x: dict1[x])
full.MiscFeature.hist()
plt.xlabel("MiscFeatures")
plt.ylabel("Count")
plt.show()
# Binary-encode MiscFeature just to measure its correlation with price.
a=full.MiscFeature.apply(lambda x: 0 if x=='None' else 1)
a.corr(y)
delete=[] #list to store features to be deleted
delete.append("MiscFeature")
full.Alley.hist()
plt.xlabel("Alley")
plt.ylabel("Count")
plt.show()
# Binary-encode Alley only to check its correlation with price.
a=full.Alley.apply(lambda x: 0 if x=='No Alley' else 1)
a.corr(y)
delete.append('Alley')
full.Fence.hist()
plt.xlabel("Fence")
plt.ylabel("Count")
plt.show()
full.FireplaceQu.hist()
plt.xlabel("Fireplace Quality")
plt.ylabel("Count")
plt.show()
# NOTE(review): this scale ranks 'TA' (typical/average) below 'Fa' (fair),
# here and in the similar mappings below; the conventional ordering is
# Po < Fa < TA < Gd < Ex -- confirm the ordering is intentional.
dict1={'NA':0,'Po':1,'TA':2,'Fa':3,'Gd':4,'Ex':5}
full.FireplaceQu=full.FireplaceQu.apply(lambda x: dict1[x])
full.GarageCond.hist()
plt.xlabel("GarageCond")
plt.ylabel("Count")
plt.show()
#replace by numbers
dict1={'No Garage':0,'Po':1,'TA':2,'Fa':3,'Gd':4,'Ex':5}
full.GarageCond=full.GarageCond.apply(lambda x: dict1[x])
full.GarageQual.hist()
plt.xlabel("GarageQual")
plt.ylabel("Count")
plt.show()
full.GarageQual=full.GarageQual.apply(lambda x: dict1[x])
full.GarageFinish.hist()
plt.xlabel("GarageFinish")
plt.ylabel("Count")
plt.show()
#a finished garage cost more
dict1={'No Garage':0,'Unf':1,'RFn':2,'Fin':3}
full.GarageFinish=full.GarageFinish.apply(lambda x: dict1[x])
full.BsmtFinType2.unique()
#higher basement rating means higher price
dict1={'No Basement':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
full.BsmtFinType1=full.BsmtFinType1.apply(lambda x: dict1[x])
full.BsmtFinType2=full.BsmtFinType2.apply(lambda x: dict1[x])
full.BsmtExposure.unique()
dict1={'No Basement':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4}
full.BsmtExposure=full.BsmtExposure.apply(lambda x: dict1[x])
full.BsmtQual.unique()
dict1={'No Basement':0,'Po':1,'TA':2,'Fa':3,'Gd':4,'Ex':5}
full.BsmtQual=full.BsmtQual.apply(lambda x: dict1[x])
full.BsmtCond.unique()
full.BsmtCond=full.BsmtCond.apply(lambda x: dict1[x])
# Histograms of the remaining categoricals; near-constant / uninformative
# ones are queued for removal in `delete`.
full.MSZoning.hist()
plt.xlabel("MSZoning")
plt.ylabel("Count")
plt.show()
full.Street.hist()
plt.xlabel("Street")
plt.ylabel("Count")
plt.show()
delete.append('Street')
full.LotShape.hist()
plt.xlabel("LotShape")
plt.ylabel("Count")
plt.show()
full.LandContour.hist()
plt.xlabel("LandContour")
plt.ylabel("Count")
plt.show()
full.Utilities.hist()
plt.xlabel("Utilities")
plt.ylabel("Count")
plt.show()
full[full.Utilities!="AllPub"]
delete.append("Utilities")
full.LotConfig.hist()
plt.xlabel("LotConfig")
plt.ylabel("Count")
plt.show()
full.LandSlope.hist()
plt.xlabel("LandSlope")
plt.ylabel("Count")
plt.show()
delete.append("LandSlope")
full.Neighborhood.hist()
plt.xlabel("Neighborhood")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
full.Condition1.hist()
plt.xlabel("Condition1")
plt.ylabel("Count")
plt.show()
delete.append("Condition1")
full.Condition2.hist()
plt.xlabel("Condition2")
plt.ylabel("Count")
plt.show()
delete.append("Condition2")
full.BldgType.hist()
plt.xlabel("BldgType")
plt.ylabel("Count")
plt.show()
full.HouseStyle.hist()
plt.xlabel("HouseStyle")
plt.ylabel("Count")
plt.show()
full.RoofStyle.hist()
plt.xlabel("RoofStyle")
plt.ylabel("Count")
plt.show()
full.RoofMatl.hist()
plt.xlabel("RoofMatl")
plt.ylabel("Count")
plt.show()
delete.append("RoofMatl")
full.Exterior1st.hist()
plt.xlabel("Exterior1st")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
full.Exterior2nd.hist()
plt.xlabel("Exterior2nd")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
# NOTE(review): Exterior2nd queued for deletion, presumably as redundant
# with Exterior1st -- confirm against the earlier plots.
delete.append('Exterior2nd')
full.MasVnrType.hist()
plt.xlabel("MasVnrType")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
full.ExterQual.hist()
plt.xlabel("ExterQual")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
# Shared ordinal scale reused for ExterQual/ExterCond/HeatingQC/KitchenQual.
dict1={'Po':1,'TA':2,'Fa':3,'Gd':4,'Ex':5}
full.ExterQual=full.ExterQual.apply(lambda x: dict1[x])
full.ExterCond.hist()
plt.xlabel("ExterCond")
plt.ylabel("Count")
plt.show()
full.ExterCond=full.ExterCond.apply(lambda x: dict1[x])
full.Foundation.hist()
plt.xlabel("Foundation")
plt.ylabel("Count")
plt.show()
full.Heating.hist()
plt.xlabel("Heating")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
delete.append("Heating")
full.HeatingQC.hist()
plt.xlabel("HeatingQC")
plt.ylabel("Count")
plt.show()
full.HeatingQC=full.HeatingQC.apply(lambda x: dict1[x])
full.CentralAir.hist()
plt.xlabel("CentralAir")
plt.ylabel("Count")
plt.show()
full.CentralAir=full.CentralAir.apply(lambda x: 1 if x=='Y' else 0)
full.Electrical.hist()
plt.xlabel("Electrical")
plt.ylabel("Count")
plt.show()
delete.append("Electrical")
full.KitchenQual.hist()
plt.xlabel("KitchenQual")
plt.ylabel("Count")
plt.show()
full.KitchenQual=full.KitchenQual.apply(lambda x: dict1[x])
full.Functional.hist()
plt.xlabel("Functional")
plt.ylabel("Count")
plt.show()
delete.append("Functional")
full.GarageType.hist()
plt.xlabel("GarageType")
plt.ylabel("Count")
plt.show()
full.PavedDrive.hist()
plt.xlabel("PavedDrive")
plt.ylabel("Count")
plt.show()
delete.append("PavedDrive")
full.SaleType.isna().sum()
full.SaleType.hist()
plt.xlabel("SaleType")
plt.ylabel("Count")
plt.show()
full.SaleCondition.hist()
plt.xlabel("Sale Condition")
plt.ylabel("Count")
plt.show()
# TotalBsmtSF equals the sum of its components, so the parts are redundant.
(full.BsmtFinSF1+full.BsmtFinSF2+full.BsmtUnfSF).equals(full.TotalBsmtSF)
delete.append("BsmtFinSF1")
delete.append("BsmtFinSF2")
delete.append("BsmtUnfSF")
(full['1stFlrSF']+full['2ndFlrSF']+full.LowQualFinSF).equals(full.GrLivArea)
delete.append("1stFlrSF")
delete.append("2ndFlrSF")
delete.append("LowQualFinSF")
print(full.FullBath.corr(y))
print(full.HalfBath.corr(y))
print(full.BsmtFullBath.corr(y))
print(full.BsmtHalfBath.corr(y))
# Combine bath counts (half baths weighted 0.5) into one engineered feature.
full['Total_Bathrooms']=(full.FullBath+full.HalfBath*0.5+full.BsmtFullBath+full.BsmtHalfBath*0.5)
full.Total_Bathrooms.corr(y)
delete.append("FullBath")
delete.append("HalfBath")
delete.append("BsmtFullBath")
delete.append("BsmtHalfBath")
print(full.OpenPorchSF.corr(y))
print(full.EnclosedPorch.corr(y))
print(full['3SsnPorch'].corr(y))
print(full.ScreenPorch.corr(y))
(full.OpenPorchSF+full.EnclosedPorch+full['3SsnPorch']+full.ScreenPorch).corr(y)
# Combine the porch areas into one engineered feature.
full['TotalPorchSF']=full.OpenPorchSF+full.EnclosedPorch+full['3SsnPorch']+full.ScreenPorch
delete.append("OpenPorchSF")
delete.append("EnclosedPorch")
delete.append("3SsnPorch")
delete.append("ScreenPorch")
full.drop(delete,axis=1,inplace=True)
# Age at sale time, counted from the last remodel year.
full['Age']=full.YrSold.astype('int')-full.YearRemodAdd.astype('int')
plt.subplots(figsize=(15,10))
ax=sns.heatmap(full.corr(),cmap=sns.color_palette("RdBu_r", 5),linewidths=.7)
cbar = ax.collections[0].colorbar
cbar.set_ticks(np.arange(-1,1.1,0.2))
plt.title("Heatmap of Numerical Features")
plt.show()
#compare which of the highly correlated features has a higher correlation with the price
full.PoolArea.corr(y)>full.PoolQC.corr(y)
#compare which of the highly correlated features has a higher correlation
full.GarageQual.corr(y)>full.GarageCond.corr(y)
#compare which of the highly correlated features has a higher correlation
full.GarageArea.corr(y)>full.GarageCars.corr(y)
#compare which of the highly correlated features has a higher correlation
full.ExterQual.corr(y)>full.KitchenQual.corr(y)
#drop highly correlated features, keeping the better-correlated one of each pair
full.drop(['PoolArea','GarageCond','GarageArea',"TotRmsAbvGrd",'KitchenQual'],axis=1,inplace=True)
numerical_features,categorical_features=get_features(full)
full_wd=full.copy() #wd=with dummy
full_wod=full.copy() #wod= without dummy
full_wd=pd.get_dummies(full_wd)
full_wod=pd.get_dummies(full_wod,drop_first=True)
train_wd=full_wd.iloc[:index,:]
test_wd=full_wd.iloc[index:,:]
train_wod=full_wod.iloc[:index,:]
test_wod=full_wod.iloc[index:,:]
# lightgbm is needed for LGBMRegressor below (notebook cell magic).
!pip install lightgbm
def model_score(name,model,mtrain,y,cv=50):
    """Cross-validate `model` on (mtrain, y) and report its RMSE.

    Prints "<name>: The average RMSE is: ... with a standard deviation of ..."
    and returns the (average_rmse, std) pair over `cv` folds.
    """
    # scoring='neg_mean_squared_error' yields negated values; negate back
    # before taking the square root.
    fold_rmse=np.sqrt(-cross_val_score(model,mtrain,y,cv=cv,scoring='neg_mean_squared_error'))
    average_rmse=np.mean(fold_rmse)
    std=np.std(fold_rmse)
    print("%s: The average RMSE is: %f with a standard deviation of %f." %(name,average_rmse,std))
    return average_rmse,std
#define models
from sklearn.linear_model import LinearRegression ,ElasticNet, Lasso, LassoLars,HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# Candidate regressors, all with default hyper-parameters.
models=[LinearRegression(),ElasticNet(),Lasso(),LassoLars(),HuberRegressor(),RandomForestRegressor(),\
GradientBoostingRegressor(),AdaBoostRegressor(),BaggingRegressor(), ExtraTreesRegressor(),SVR(),\
LGBMRegressor(),XGBRegressor()]
#LR:Linear Regressor
#EN:Elastic Net
#L:Lasso
#Lar:LassoLars
#HR:HuberRegressor
#RF:Random Forest
#GBR:Gradient Boosting Regressor
#Ada:AdaBoostRegressor
#Bag:BaggingRegressor
#Extra:ExtraTreesRegressor
#SVR:Support Vector Regressor
#LGBMR:Light GBM Regressor
#XGBR:XGB Regressor
names=["LR","EN","L","Lar","HR","RF","GBR","Ada","Bag","Extra", "SVR","LGBMR","XGBR"]
#lists to save rmse and std of each model
errors=[]
stds=[]
cv_=50
#iterate through models, calculate rmse and std, store them in lists
# (this run uses the drop_first encoding, train_wod)
for name,model in zip(names,models):
    rsme,std=model_score(name,model,train_wod,log_y,cv_)
    errors.append(rsme)
    stds.append(std)
#bar plot of model rmse
bar1=plt.bar(names,errors,label="RSME")
bar2=plt.bar(names,stds,label="Variance")
#we change the color of the bar with min rmse and min std to lighter colors
bar1[np.argmin(errors)].set_color('#bae8f5')
bar2[np.argmin(stds)].set_color('#ffcc66')
plt.legend()
# Re-run the same benchmark on the other encoding (train_wd, default cv).
#lists to save rmse and std of each model
errors=[]
stds=[]
#iterate through models, calculate rmse and std, store them in lists
for name,model in zip(names,models):
    rsme,std=model_score(name,model,train_wd,log_y)
    errors.append(rsme)
    stds.append(std)
#bar plot of model rmse
bar1=plt.bar(names,errors,label="RSME")
bar2=plt.bar(names,stds,label="Variance")
#we change the color of the bar with min rmse and min std to lighter colors
bar1[np.argmin(errors)].set_color('#bae8f5')
bar2[np.argmin(stds)].set_color('#ffcc66')
plt.legend()
#import cleaned data (snapshot saved before any ordinal encoding)
full=full_cleaned.copy()
numerical_features,categorical_features=get_features(full)
numerical_features
#MSSubClass should be categorical
full['MSSubClass'] = full['MSSubClass'].apply(str)
#Year and month sold should also be categorical.
full['YrSold'] = full['YrSold'].astype(str)
full['MoSold'] = full['MoSold'].astype(str)
numerical_features,categorical_features=get_features(full)
numerical_features
categorical_features
# Ordinal-encode the quality/condition scales.
dict1={'No Pool':0,'Fa':1,'Gd':2,'Ex':3}
full.PoolQC=full.PoolQC.apply(lambda x: dict1[x])
# NOTE(review): this scale ranks 'TA' (typical/average) below 'Fa' (fair),
# as in the earlier pass; the conventional order is Po < Fa < TA < Gd < Ex.
dict1={'NA':0,'Po':1,'TA':2,'Fa':3,'Gd':4,'Ex':5}
full.FireplaceQu=full.FireplaceQu.apply(lambda x: dict1[x])
dict1={'No Garage':0,'Po':1,'TA':2,'Fa':3,'Gd':4,'Ex':5}
full.GarageCond=full.GarageCond.apply(lambda x: dict1[x])
full.GarageQual=full.GarageQual.apply(lambda x: dict1[x])
dict1={'No Garage':0,'Unf':1,'RFn':2,'Fin':3}
full.GarageFinish=full.GarageFinish.apply(lambda x: dict1[x])
dict1={'No Basement':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
full.BsmtFinType1=full.BsmtFinType1.apply(lambda x: dict1[x])
full.BsmtFinType2=full.BsmtFinType2.apply(lambda x: dict1[x])
dict1={'No Basement':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4}
full.BsmtExposure=full.BsmtExposure.apply(lambda x: dict1[x])
dict1={'No Basement':0,'Po':1,'TA':2,'Fa':3,'Gd':4,'Ex':5}
full.BsmtQual=full.BsmtQual.apply(lambda x: dict1[x])
full.BsmtCond=full.BsmtCond.apply(lambda x: dict1[x])
dict1={'Po':1,'TA':2,'Fa':3,'Gd':4,'Ex':5}
full.ExterQual=full.ExterQual.apply(lambda x: dict1[x])
full.ExterCond=full.ExterCond.apply(lambda x: dict1[x])
full.HeatingQC=full.HeatingQC.apply(lambda x: dict1[x])
full.KitchenQual=full.KitchenQual.apply(lambda x: dict1[x])
full.CentralAir=full.CentralAir.apply(lambda x: 1 if x=='Y' else 0)
#added these
#When home functionality is typical, price is higher and it decreases as functionality decreases.
dict1={'Sal':0, 'Sev':1, 'Maj2':2, 'Maj1':3, 'Mod':4, 'Min2':5, 'Min1':6, 'Typ':7}
full.Functional=full.Functional.apply(lambda x: dict1[x])
#Lands with gentle slope usually cost more than lands with moderate slope which usually cost more than lands with severe slopes.
dict1={'Sev':0,'Mod':1,'Gtl':2}
full.LandSlope=full.LandSlope.apply(lambda x: dict1[x])
#A paved street costs more, so let's encode 'Pave' with 1 and 'Grvl' with 0.
full.Street=full.Street.apply(lambda x: 1 if x=='Pave' else 0)
#A paved drive costs more.
dict1={'N':0,'P':1,'Y':2}
full.PavedDrive=full.PavedDrive.apply(lambda x: dict1[x])
# Engineered features: bath count, total porch area, age, remodel flag, total area.
full['Total_Bathrooms']=(full.FullBath+full.HalfBath*0.5+full.BsmtFullBath+full.BsmtHalfBath*0.5)
full['TotalPorchSF']=full.OpenPorchSF+full.EnclosedPorch+full['3SsnPorch']+full.ScreenPorch
#2010 because this dataset was available in 2010
full['Age']=2010-full.YearBuilt
#added these
full["Remodeled"] = (full["YearRemodAdd"] != full["YearBuilt"]) * 1
area_cols = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF',
'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'LowQualFinSF', 'PoolArea' ]
full["TotalArea"] =full[area_cols].sum(axis=1)
numerical_features,categorical_features=get_features(full)
# One-hot encode the remaining categoricals (dropping the first dummy).
full_hot=pd.get_dummies(full.copy(),drop_first=True)
#lasso feature importances (L1 regularization zeroes out unhelpful features)
lasso=Lasso()
lasso.fit(full_hot.copy().iloc[:index,:],log_y)
FI_lasso = pd.DataFrame({"Feature Importance":lasso.coef_}, index=full_hot.columns)
FI_lasso[FI_lasso["Feature Importance"]!=0].sort_values("Feature Importance").plot(kind="barh",figsize=(15,25))
plt.show()
#try with random forest
rf=RandomForestRegressor()
rf.fit(full_hot.copy().iloc[:index,:],log_y)
FI_rf = pd.DataFrame({"Feature Importance":rf.feature_importances_}, index=full_hot.columns)
FI_rf.sort_values("Feature Importance").iloc[-20:,:].plot(kind="barh",figsize=(15,25))
plt.show()
# Keep only features the forest rates above a small importance threshold.
threshold=0.0001
imp_features=FI_rf[FI_rf['Feature Importance']>threshold].index
full_imp=full_hot.copy()[imp_features]
!pip install statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor
# For each X, calculate VIF and save in dataframe
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(full_hot.values, i) for i in range(full_hot.shape[1])]
vif["features"] = full_hot.columns
#choose important features
# NOTE(review): VIF > 5 flags MULTICOLLINEAR features; the usual practice is
# to DROP such features, not keep them -- confirm this selection is intended.
imp_features=vif[vif["VIF Factor"]>5].features.values
full_vif=full_hot.copy()[imp_features]
from scipy.special import boxcox1p
#for full_imp: unskew heavily right-skewed columns with a Box-Cox(1+x) transform
skew=pd.DataFrame()
skew['Skew']=full_imp.skew()
skewed_columns=skew[skew>0.7].dropna().index
full_imp_skewed=full_imp.copy()
# 0.15 is the Box-Cox lambda applied to every skewed column.
full_imp_skewed[skewed_columns]=boxcox1p(full_imp_skewed[skewed_columns],0.15)
#for full_vif
skew=pd.DataFrame()
skew['Skew']=full_vif.skew()
skewed_columns=skew[skew>0.7].dropna().index
full_vif_skewed=full_vif.copy()
full_vif_skewed[skewed_columns]=boxcox1p(full_vif_skewed[skewed_columns],0.15)
# RobustScaler (median/IQR based) is less sensitive to outliers than z-scoring.
from sklearn.preprocessing import RobustScaler
scaler=RobustScaler()
full_imp_scaled=full_imp.copy()
full_imp_scaled[full_imp.columns]=scaler.fit_transform(full_imp_scaled)
full_imp_skewed_scaled=full_imp_skewed.copy()
full_imp_skewed_scaled[full_imp_skewed.columns]=scaler.fit_transform(full_imp_skewed_scaled)
full_vif_scaled=full_vif.copy()
full_vif_scaled[full_vif_scaled.columns]=scaler.fit_transform(full_vif_scaled)
full_vif_skewed_scaled=full_vif_skewed.copy()
full_vif_skewed_scaled[full_vif_skewed_scaled.columns]=scaler.fit_transform(full_vif_skewed_scaled)
# All candidate feature sets, with matching display names for the plots below.
datasets=[full_imp,full_imp_skewed,full_imp_scaled,full_imp_skewed_scaled,full_vif,full_vif_skewed,full_vif_scaled,full_vif_skewed_scaled]
datasets_names=["full_imp","full_imp_skewed","full_imp_scaled","full_imp_skewed_scaled","full_vif","full_vif_skewed","full_vif_scaled","full_vif_skewed_scaled"]
# FIX: cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score
def model_score(name, model, mtrain, y, cv=50):
    """Cross-validate `model` on (mtrain, y); print and return (mean RMSE, std).

    Parameters
    ----------
    name : str -- label used in the printed report
    model : sklearn-style estimator
    mtrain : feature matrix
    y : target vector
    cv : int -- number of folds (default 50)
    """
    #mse are by default negative in sklearn, so flip the sign
    mse = -cross_val_score(model, mtrain, y, cv=cv, scoring='neg_mean_squared_error')
    rmse = np.sqrt(mse)
    average_rmse = np.mean(rmse)
    std = np.std(rmse)
    #print(rmse)
    print("%s: The average RMSE is: %f with a standard deviation of %f." % (name, average_rmse, std))
    return average_rmse, std
# Candidate regressors and their short labels (index-aligned with `names`):
#   LR=LinearRegression   EN=ElasticNet      L=Lasso           Lar=LassoLars
#   HR=HuberRegressor     RF=RandomForest    GBR=GradBoosting  Ada=AdaBoost
#   Bag=Bagging           Extra=ExtraTrees   SVR=SupportVector LGBMR=LightGBM
#   XGBR=XGBoost          KR=KernelRidge     R=Ridge           BR=BayesianRidge
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, LassoLars, HuberRegressor, Ridge, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor)
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

models = [
    LinearRegression(),
    ElasticNet(),
    Lasso(),
    LassoLars(),
    HuberRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(n_estimators=500, loss='huber'),
    AdaBoostRegressor(),
    BaggingRegressor(),
    ExtraTreesRegressor(),
    SVR(),
    LGBMRegressor(objective='regression', n_estimators=500),
    XGBRegressor(n_estimators=500),
    KernelRidge(kernel='polynomial'),
    Ridge(),
    BayesianRidge(),
]
names = ["LR", "EN", "L", "Lar", "HR", "RF", "GBR", "Ada", "Bag", "Extra",
         "SVR", "LGBMR", "XGBR", "KR", "R", "BR"]
# Benchmark every model on every dataset variant; one bar chart per dataset
# showing RMSE and its spread, with the best bars highlighted.
for ds_index, data in enumerate(datasets):
    # keep only the training rows (the tail of `data` is the test set)
    train = data.iloc[:index, :]
    errors = []
    stds = []
    for name, model in zip(names, models):
        model_rmse, model_std = model_score(name, model, train, log_y, 50)
        errors.append(model_rmse)
        stds.append(model_std)
    plt.figure()
    plt.title(datasets_names[ds_index])
    plt.ylim([0, 0.5])
    rmse_bars = plt.bar(names, errors, label="RSME")
    std_bars = plt.bar(names, stds, label="Variance")
    # highlight the minimum-RMSE and minimum-std bars in lighter colors
    rmse_bars[np.argmin(errors)].set_color('#bae8f5')
    std_bars[np.argmin(stds)].set_color('#ffcc66')
    plt.legend()
    plt.show()
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for BayesianRidge on the best dataset variant.
train = full_imp_skewed_scaled.iloc[:index, :]
# NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn >= 1.3 --
# keep `n_iter` only if pinned to an older sklearn; confirm against the env.
parameters = [{'n_iter': [300, 1000], 'tol': [0.1, 0.001], 'alpha_1': [1e-6, 1e-4],
               'lambda_1': [1e-6, 1e-4]}]
BR = BayesianRidge()
grid_search = GridSearchCV(estimator=BR, param_grid=parameters, scoring='neg_mean_squared_error', cv=50, n_jobs=-1, verbose=1)
grid_search = grid_search.fit(train, log_y)
print("The best score is %f." % (grid_search.best_score_))
print("The best parameters are ", grid_search.best_params_)
# FIX: `grid_scores_` was deprecated in 0.18 and removed in 0.20;
# the per-candidate results table is `cv_results_`.
grid_search.cv_results_
# Re-score the chosen configuration with 50-fold CV.
b = BayesianRidge(alpha_1=1e-6, n_iter=300, lambda_1=0.0001, tol=0.001)
mse = -cross_val_score(b, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
# Hyper-parameter search for plain Ridge regression.
parameters = [{'alpha': [1.e-4, 0.1, 1, 5, 10],
               'fit_intercept': [True, False],
               'tol': [0.001, 0.1, 1]}]
R = Ridge()
grid_search = GridSearchCV(estimator=R, param_grid=parameters,
                           scoring='neg_mean_squared_error',
                           cv=50, n_jobs=-1, verbose=1)
grid_search = grid_search.fit(train, log_y)
print("The best score is %f." % (grid_search.best_score_))
print("The best parameters are ", grid_search.best_params_)
# Re-score the winning configuration with 50-fold CV.
R = Ridge(alpha=10, fit_intercept=True, tol=0.001)
mse = -cross_val_score(R, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
# Hyper-parameter search for GradientBoostingRegressor.
parameters = [{'loss': ['ls', 'huber'],
               'learning_rate': [0.1, 0.01, 1],
               'n_estimators': [100, 300, 500],
               'max_depth': [3, 7, 10, 15]}]
GBR = GradientBoostingRegressor()
grid_search = GridSearchCV(estimator=GBR, param_grid=parameters,
                           scoring='neg_mean_squared_error',
                           cv=50, n_jobs=-1, verbose=1)
grid_search = grid_search.fit(train, log_y)
print("The best score is %f." % (grid_search.best_score_))
print("The best parameters are ", grid_search.best_params_)
# Re-score the winning configuration (GBR is reused later by the ensembles).
GBR = GradientBoostingRegressor(loss='ls', n_estimators=500, learning_rate=0.1, max_depth=3)
mse = -cross_val_score(GBR, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
# Hyper-parameter search for XGBRegressor.
parameters = [{'booster': ['gbtree', 'dart'], 'n_estimators': [100, 300], 'gamma': [0, 0.5, 1],
               'max_depth': [3, 7, 10], 'reg_lambda': [0.1, 1], 'reg_alpha': [0.1, 1]}]
XGBR = XGBRegressor()
grid_search = GridSearchCV(estimator=XGBR, param_grid=parameters, scoring='neg_mean_squared_error', cv=50, n_jobs=-1, verbose=1)
grid_search = grid_search.fit(train, log_y)
print("The best score is %f." % (grid_search.best_score_))
print("The best parameters are ", grid_search.best_params_)
# FIX: `grid_scores_` was deprecated in 0.18 and removed in 0.20;
# the per-candidate results table is `cv_results_`.
grid_search.cv_results_
# Re-score the winning configuration (XGBR is reused later by the ensembles).
XGBR = XGBRegressor(booster='dart', reg_alpha=1, max_depth=3, reg_lambda=0.1, n_estimators=300, gamma=0)
mse = -cross_val_score(XGBR, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# Adapted from: https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble that fits clones of its base models and predicts their plain mean."""

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on (X, y); originals stay untouched."""
        self.models_ = [clone(est) for est in self.models]
        for est in self.models_:
            est.fit(X, y)
        return self

    def predict(self, X):
        """Return the unweighted column-wise mean of the base-model predictions."""
        stacked = np.column_stack([est.predict(X) for est in self.models_])
        return np.mean(stacked, axis=1)

# Evaluate the plain-mean ensemble of the four tuned models.
averaged_models = AveragingModels(models=(b, R, GBR, XGBR))
mse = -cross_val_score(averaged_models, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
# Adapted from: https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Weighted ensemble: fixed weights (0.3, 0.3, 0.2, 0.2) over four base models."""

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on (X, y)."""
        self.models_ = [clone(est) for est in self.models]
        for est in self.models_:
            est.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted combination of the per-model predictions."""
        per_model = np.column_stack([est.predict(X) for est in self.models_])
        w = np.array([0.3, 0.3, 0.2, 0.2])
        return np.dot(per_model, w.T)

# Evaluate the (0.3, 0.3, 0.2, 0.2)-weighted ensemble.
averaged_models = AveragingModels(models=(b, R, GBR, XGBR))
mse = -cross_val_score(averaged_models, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
# Adapted from: https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Weighted ensemble: fixed weights (0.4, 0.3, 0.2, 0.1) over four base models."""

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on (X, y)."""
        self.models_ = [clone(est) for est in self.models]
        for est in self.models_:
            est.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted combination of the per-model predictions."""
        per_model = np.column_stack([est.predict(X) for est in self.models_])
        w = np.array([0.4, 0.3, 0.2, 0.1])
        return np.dot(per_model, w.T)

# Evaluate the (0.4, 0.3, 0.2, 0.1)-weighted ensemble.
averaged_models = AveragingModels(models=(b, R, GBR, XGBR))
mse = -cross_val_score(averaged_models, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
# credits: https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Weighted ensemble emphasizing the first model (ratios 0.5 : 0.2 : 0.1 : 0.1)."""

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on (X, y)."""
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted combination of the per-model predictions."""
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        # BUG FIX: the original weights (0.5, 0.2, 0.1, 0.1) sum to 0.9, which
        # systematically shrinks every prediction by ~10%. Normalize so the
        # intended ratios are kept but the weights form a proper convex mean.
        w = np.array([0.5, 0.2, 0.1, 0.1])
        w = w / w.sum()
        return np.dot(predictions, w.T)

# Evaluate the normalized (0.5 : 0.2 : 0.1 : 0.1)-weighted ensemble.
averaged_models = AveragingModels(models=(b, R, GBR, XGBR))
mse = -cross_val_score(averaged_models, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
!pip install keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
def baseline_model():
model = Sequential()
model.add(Dense(130, input_dim=130,kernel_initializer='normal', activation='relu'))
model.add(Dense(80, activation='relu',kernel_initializer='normal'))
model.add(Dense(40, activation='relu',kernel_initializer='normal'))
model.add(Dense(20, activation='relu',kernel_initializer='normal'))
model.add(Dense(10, activation='relu',kernel_initializer='normal'))
model.add(Dense(1,kernel_initializer='normal'))
# Compile model
model.compile(loss='mean_squared_error', optimizer='adam')
return model
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# evaluate model
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=50, random_state=seed)
results = cross_val_score(estimator, train, log_y, cv=kfold,verbose=1)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))
# Final ensemble definition: this is the instance used for the submission.
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Weighted ensemble with fixed weights (0.3, 0.3, 0.2, 0.2)."""

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on (X, y)."""
        self.models_ = [clone(est) for est in self.models]
        for est in self.models_:
            est.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted combination of the per-model predictions."""
        per_model = np.column_stack([est.predict(X) for est in self.models_])
        w = np.array([0.3, 0.3, 0.2, 0.2])
        return np.dot(per_model, w.T)

# Evaluate the final ensemble before fitting it for the submission.
averaged_models = AveragingModels(models=(b, R, GBR, XGBR))
mse = -cross_val_score(averaged_models, train, log_y, cv=50, scoring='neg_mean_squared_error')
rmse = np.sqrt(mse)
average_rmse = np.mean(rmse)
std = np.std(rmse)
print(" The average RMSE is: %f with a standard deviation of %f." % (average_rmse, std))
# Score the held-out rows and write the Kaggle submission file.
test = full_imp_skewed_scaled.iloc[index:, :]
test.shape
averaged_models.fit(train, log_y)
# Undo the log transform of the target.
# NOTE(review): assumes log_y = np.log(y); if np.log1p was used upstream,
# this must be np.expm1 -- confirm where log_y is defined.
y_pred = np.exp(averaged_models.predict(test))
submissions = pd.DataFrame({'Id': test_ID, 'SalePrice': y_pred})
submissions.to_csv('submission.csv', index=False)
!pip install mxnet
import mxnet as mx
from mxnet import gluon
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from mxnet import ndarray as nd
from mxnet import autograd
from mxnet import gluon
X_train,X_test=import_data()
all_X = pd.concat((X_train.loc[:, 'MSSubClass':'SaleCondition'],
X_test.loc[:, 'MSSubClass':'SaleCondition']))
numnic_feats = all_X.dtypes[all_X.dtypes!='object'].index
all_X[numnic_feats] = all_X[numnic_feats].apply(lambda x: (x-x.mean())/x.std())
all_X = pd.get_dummies(all_X, dummy_na=True)
all_X = all_X.fillna(all_X.mean())
num_train = X_train.shape[0]
X_train = all_X[:num_train].as_matrix()
X_test = all_X[num_train:].as_matrix()
y_train = y.as_matrix()
from mxnet import ndarray as nd
from mxnet import autograd
from mxnet import gluon
X_train = nd.array(X_train)
y_train = nd.array(y_train)
y_train.reshape((num_train, 1))
X_test = nd.array(X_test)
square_loss = gluon.loss.L2Loss()
def get_rmse_log(net, X_train, y_train):
    """RMSE between log-predictions and log-targets, as a Python float."""
    n = X_train.shape[0]
    # clip so log() never sees values below 1
    preds = nd.clip(net(X_train), 1, float('inf'))
    loss_sum = nd.sum(square_loss(nd.log(preds), nd.log(y_train)) / n)
    # L2Loss is 0.5 * squared error, hence the factor of 2
    return nd.sqrt(2 * loss_sum).asscalar()
def get_net():
    """Build and initialize a small MLP: one 128-unit ReLU layer, light dropout,
    then a single linear output unit."""
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(gluon.nn.Dense(128, activation='relu'))
        net.add(gluon.nn.Dropout(0.01))
        net.add(gluon.nn.Dense(1))
    net.initialize()
    return net
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
# NOTE(review): this redefines the global name `train` (previously a DataFrame).
def train(net, X_train, y_train, X_test, y_test, epochs,
          verbose_epoch, learning_rate, weight_decay):
    """Train *net* with Adam + weight decay, plotting per-epoch RMSE-log curves.

    Returns the final train loss, plus the final test loss when X_test is given.
    """
    train_loss = []
    if X_test is not None:
        test_loss = []
    batch_size = 100
    dataset_train = gluon.data.ArrayDataset(X_train, y_train)
    data_iter_train = gluon.data.DataLoader(dataset_train, batch_size, shuffle=True)
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate, 'wd': weight_decay})
    # re-initialize so repeated calls (e.g. k-fold) never reuse trained weights
    net.collect_params().initialize(force_reinit=True)
    for epoch in range(epochs):
        for data, label in data_iter_train:
            with autograd.record():
                output = net(data)
                loss = square_loss(output, label)
            loss.backward()
            trainer.step(batch_size)
        cur_train_loss = get_rmse_log(net, X_train, y_train)
        if epoch > verbose_epoch:
            print("Epoch %d, train loss: %f" % (epoch, cur_train_loss))
        train_loss.append(cur_train_loss)
        if X_test is not None:
            cur_test_loss = get_rmse_log(net, X_test, y_test)
            test_loss.append(cur_test_loss)
    plt.plot(train_loss)
    plt.legend(['train'])
    if X_test is not None:
        plt.plot(test_loss)
        plt.legend(['train', 'test'])
    plt.show()
    if X_test is not None:
        return cur_train_loss, cur_test_loss
    return cur_train_loss
def k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
                       learning_rate, weight_decay):
    """Manual k-fold cross-validation for the Gluon net.

    Returns (average train loss, average validation loss) over the k folds.
    """
    assert k > 1
    fold_size = X_train.shape[0] // k
    train_loss_sum = 0.0
    test_loss_sum = 0.0
    for test_i in range(k):
        # fold `test_i` is held out for validation
        X_val_test = X_train[test_i * fold_size: (test_i + 1) * fold_size, :]
        y_val_test = y_train[test_i * fold_size: (test_i + 1) * fold_size]
        # concatenate every other fold into the training split
        val_train_defined = False
        for i in range(k):
            if i == test_i:
                continue
            X_cur_fold = X_train[i * fold_size: (i + 1) * fold_size, :]
            y_cur_fold = y_train[i * fold_size: (i + 1) * fold_size]
            if not val_train_defined:
                X_val_train, y_val_train = X_cur_fold, y_cur_fold
                val_train_defined = True
            else:
                X_val_train = nd.concat(X_val_train, X_cur_fold, dim=0)
                y_val_train = nd.concat(y_val_train, y_cur_fold, dim=0)
        net = get_net()
        train_loss, test_loss = train(
            net, X_val_train, y_val_train, X_val_test, y_val_test,
            epochs, verbose_epoch, learning_rate, weight_decay)
        train_loss_sum += train_loss
        print("Test loss: %f" % test_loss)
        test_loss_sum += test_loss
    return train_loss_sum / k, test_loss_sum / k
# Hyper-parameters for the final k-fold run of the Gluon net.
k = 50
epochs = 100
verbose_epoch = 35
learning_rate = 0.01
weight_decay = 130
train_loss, test_loss = k_fold_cross_valid(k, epochs, verbose_epoch, X_train,
                                           y_train, learning_rate, weight_decay)
print("%d-fold validation: Avg train loss: %f, Avg test loss: %f" %
      (k, train_loss, test_loss))